/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.examples;

import java.util.ArrayList;
import java.util.List;

import mosdi.fa.Alphabet;
import mosdi.fa.CDFA;
import mosdi.fa.DFAFactory;
import mosdi.fa.FiniteMemoryTextModel;
import mosdi.fa.GeneralizedString;
import mosdi.fa.IIDTextModel;
import mosdi.paa.MatchCountDAA;
import mosdi.paa.PAA;
import mosdi.paa.TextBasedPAA;
import mosdi.util.Log;

public class PValue {

	public static void main(String[] args) {
		// first 100 nt of c. glutamicum genome
		String genome = "GTGAGCCAGAACTCATCTTCTTTGCTCGAAACCTGGCGCCAAGTTGTTGCCGATCTCACAACTTTGAGCCAGCAAGCGGACAGTGGATTCGACCCATTGA"; 
		
		Log.setTimingActive(true);
		Log.setLogLevel(Log.Level.VERBOSE);

		// the alphabet of nucleotides
		Alphabet dnaAlphabet =  Alphabet.getDnaAlphabet();
		// calculate empiric distribution from letter frequencies
		FiniteMemoryTextModel textModel = new IIDTextModel(dnaAlphabet, genome);
		
		// create generalized string for a motif and its complement ...
		GeneralizedString p1 =  new GeneralizedString(dnaAlphabet, "GA?C");
		GeneralizedString p2 =  new GeneralizedString(dnaAlphabet, "G?TC");
		// ... and put them into a list
		List<GeneralizedString> l = new ArrayList<GeneralizedString>(2);
		l.add(p1);
		l.add(p2);
		
		// build cdfa (=counting deterministic finite automaton) from motifs
		CDFA cdfa = DFAFactory.build(dnaAlphabet, l);
		// optional step: minimize number of states (speeds up calculations)
		cdfa = cdfa.minimize();

		// count number of matches
		int matches = cdfa.countMatches(genome);
		
		MatchCountDAA daa = new MatchCountDAA(cdfa, matches);
		PAA paa = new TextBasedPAA(daa, textModel);
		Log.printf(Log.Level.VERBOSE, "PAA has %d states.%n", paa.getStateCount());

		// this steps takes O(#matches * |genome| * #dfa-states)
		double pvalue = paa.computeValueDistribution(genome.length())[matches];
		
		Log.printf(Log.Level.STANDARD, "dfa states: %d, matches: %d, p-value: %e%n", cdfa.getStateCount(), matches, pvalue);
	}
}
